Main question to address: What areas of NYC are under-serviced by ADA accessible stations?
Plan: Somehow randomly “sample” locations in the NYC territory in (longitude, latitude) unit pairs and then calculate distance from that point to the nearest ada-certified (accessible) subway station. Normalize to nearest (accessible or not accessible) station?
Need: - Data for which stations are ADA-accessible - Data for station locations (longitude, latitude) - Some way to sample the territory of NYC (it’s an odd shape)
library(tidyverse)
library(ggmap)
library(cowplot)
library(viridis)
library(geosphere)
library(rgdal)
library(tmap)
Not sure which are the most reliable datasets to use here.
ada_raw <- read_csv("./data/Elevator Escalator Station Data.csv") %>%
arrange(station)
subloc_raw <- read_csv("./data/NYC_Transit_Subway_Entrance_And_Exit_Data.csv") %>%
arrange(`Station Name`) #%>%
#mutate_all(str_to_lower)
set.seed(2018)
nyc311_loc <- read_csv(file = "./data/unique_nyc_2015_311.csv") %>%
distinct() %>%
sample_frac(0.25, replace = FALSE)
colnames(subloc_raw) <- colnames(subloc_raw) %>%
str_to_lower() %>%
gsub(pattern = " ", replacement = "_")
colnames(ada_raw) <- colnames(ada_raw) %>%
str_to_lower()
colnames(nyc311_loc) <- colnames(nyc311_loc) %>%
str_to_lower()
glimpse(ada_raw)
Observations: 570
Variables: 7
$ station <chr> "125 St", "125 St", "125 St", "125 St", "125 St"...
$ borough <chr> "MN", "MN", "MN", "MN", "MN", "MN", "MN", "MN", ...
$ trainno <chr> "A/B/C/D", "A/B/C/D", "4/5/6/METRO-NORTH", "1", ...
$ equipmentno <chr> "EL143", "EL144", "EL126", "ES101", "ES103", "EL...
$ equipmenttype <chr> "EL", "EL", "EL", "ES", "ES", "EL", "EL", "ES", ...
$ serving <chr> "MEZZANINE AND DOWNTOWN", "MEZZANINE AND UPTOWN"...
$ ada <chr> "Y", "Y", "Y", "N", "N", "Y", "Y", "N", "Y", "Y"...
glimpse(subloc_raw)
Observations: 1,868
Variables: 32
$ division <chr> "IND", "IRT", "IRT", "IRT", "IRT", "IRT", "...
$ line <chr> "8 Avenue", "Broadway-7th Ave", "Broadway-7...
$ station_name <chr> "103rd St", "103rd St", "103rd St", "103rd ...
$ station_latitude <dbl> 40.79609, 40.79945, 40.79945, 40.79945, 40....
$ station_longitude <dbl> -73.96145, -73.96838, -73.96838, -73.96838,...
$ route1 <chr> "B", "1", "1", "1", "1", "1", "7", "7", "7"...
$ route2 <chr> "C", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ route3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route6 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route7 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route8 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route9 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route10 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route11 <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ entrance_type <chr> "Stair", "Stair", "Stair", "Stair", "Stair"...
$ entry <chr> "YES", "YES", "YES", "YES", "YES", "NO", "Y...
$ exit_only <chr> NA, NA, NA, NA, NA, "Yes", NA, NA, NA, NA, ...
$ vending <chr> "YES", "YES", "YES", "YES", "YES", "NO", "Y...
$ staffing <chr> "FULL", "FULL", "FULL", "FULL", "FULL", "NO...
$ staff_hours <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ ada <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F...
$ ada_notes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ free_crossover <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T...
$ north_south_street <chr> "Central Park West", "Broadway", "Broadway"...
$ east_west_street <chr> "103rd St", "103rd St", "103rd St", "103rd ...
$ corner <chr> "NW", "NW", "NE", "NW", "NE", "SE", "SE", "...
$ entrance_latitude <dbl> 40.79626, 40.79928, 40.79914, 40.79941, 40....
$ entrance_longitude <dbl> -73.96161, -73.96868, -73.96831, -73.96861,...
$ station_location <chr> "(40.796092, -73.961454)", "(40.799446, -73...
$ entrance_location <chr> "(40.796263, -73.961610)", "(40.799282, -73...
glimpse(nyc311_loc)
Observations: 103,510
Variables: 3
$ borough <chr> "BROOKLYN", "QUEENS", "MANHATTAN", "BROOKLYN", "QUEE...
$ latitude <dbl> 40.60375, 40.76693, 40.79832, 40.62297, 40.68408, 40...
$ longitude <dbl> -73.99558, -73.88903, -73.94185, -73.91094, -73.8448...
Mucking around, trying to see what I have to work with.
# What are the cols and how many missing values per col?
subloc_raw <- subloc_raw %>%
replace_na(
list(
route1 = "", route2 = "", route3 = "", route4 = "",
route5 = "", route6 = "", route7 = "", route8 = "",
route9 = "", route10 = "", route11 = ""
)
) %>%
unite(col = routes, route1:route11, sep = "")
subloc_raw %>%
is.na %>%
as.data.frame() %>%
sapply(sum)
division line station_name
0 0 0
station_latitude station_longitude routes
0 0 0
entrance_type entry exit_only
0 0 1812
vending staffing staff_hours
0 0 1828
ada ada_notes free_crossover
0 1793 0
north_south_street east_west_street corner
29 35 32
entrance_latitude entrance_longitude station_location
0 0 0
entrance_location
0
ada_raw %>%
is.na %>%
as.data.frame() %>%
sapply(sum)
station borough trainno equipmentno equipmenttype
0 0 0 0 0
serving ada
0 0
# would help future steps to assign a borough to each station - can the ada_raw data be used?
subloc_raw %>%
select(line:station_longitude) %>%
distinct() %>%
anti_join(ada_raw, by = c("station_name" = "station"))
# A tibble: 418 x 4
line station_name station_latitude station_longitu~
<chr> <chr> <dbl> <dbl>
1 8 Avenue 103rd St 40.8 -74.0
2 Broadway-7th~ 103rd St 40.8 -74.0
3 Flushing 103rd St 40.7 -73.9
4 Lexington 103rd St 40.8 -73.9
5 Broadway Jam~ 104th St-102nd St 40.7 -73.8
6 Liberty 104th St-Oxford Av 40.7 -73.8
7 Lexington 110th St 40.8 -73.9
8 Lenox 110th St-Central Park ~ 40.8 -74.0
9 Broadway Jam~ 111th St 40.7 -73.8
10 Flushing 111th St 40.8 -73.9
# ... with 408 more rows
# not many station names match across the two datasets - would take some time to untangle - is there another way?
test <- subloc_raw %>%
select(line:station_longitude) %>%
distinct() %>%
mutate(join_key = paste(round(station_latitude, 2), round(station_longitude, 2), sep = "_")) %>%
left_join(nyc311_loc %>%
mutate(latitude = round(latitude, 2),
longitude = round(longitude, 2),
join_key = paste(latitude, longitude, sep = "_")) %>%
distinct(), by = "join_key")
# all of the stations can be assigned a location based on the rounded NYC 311 dataset!
subloc_bor <- subloc_raw %>%
mutate(join_key = paste(round(station_latitude, 2), round(station_longitude, 2), sep = "_")) %>%
left_join(nyc311_loc %>%
mutate(latitude = round(latitude, 2),
longitude = round(longitude, 2),
join_key = paste(latitude, longitude, sep = "_")) %>%
distinct(), by = "join_key") %>%
# cleanup
select(-join_key, -latitude, -longitude) %>%
select(borough, everything())
glimpse(subloc_bor)
Observations: 1,957
Variables: 23
$ borough <chr> "MANHATTAN", "MANHATTAN", "MANHATTAN", "MAN...
$ division <chr> "IND", "IRT", "IRT", "IRT", "IRT", "IRT", "...
$ line <chr> "8 Avenue", "Broadway-7th Ave", "Broadway-7...
$ station_name <chr> "103rd St", "103rd St", "103rd St", "103rd ...
$ station_latitude <dbl> 40.79609, 40.79945, 40.79945, 40.79945, 40....
$ station_longitude <dbl> -73.96145, -73.96838, -73.96838, -73.96838,...
$ routes <chr> "BC", "1", "1", "1", "1", "1", "7", "7", "7...
$ entrance_type <chr> "Stair", "Stair", "Stair", "Stair", "Stair"...
$ entry <chr> "YES", "YES", "YES", "YES", "YES", "NO", "Y...
$ exit_only <chr> NA, NA, NA, NA, NA, "Yes", NA, NA, NA, NA, ...
$ vending <chr> "YES", "YES", "YES", "YES", "YES", "NO", "Y...
$ staffing <chr> "FULL", "FULL", "FULL", "FULL", "FULL", "NO...
$ staff_hours <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ ada <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F...
$ ada_notes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ free_crossover <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T...
$ north_south_street <chr> "Central Park West", "Broadway", "Broadway"...
$ east_west_street <chr> "103rd St", "103rd St", "103rd St", "103rd ...
$ corner <chr> "NW", "NW", "NE", "NW", "NE", "SE", "SE", "...
$ entrance_latitude <dbl> 40.79626, 40.79928, 40.79914, 40.79941, 40....
$ entrance_longitude <dbl> -73.96161, -73.96868, -73.96831, -73.96861,...
$ station_location <chr> "(40.796092, -73.961454)", "(40.799446, -73...
$ entrance_location <chr> "(40.796263, -73.961610)", "(40.799282, -73...
# after investigating, there are 27 extra rows - some of the nyc 311 locations are wrongly labeled
The Subway Entrance and Exit data seems to be good enough for an initial analysis. Hopefully the ADA column in that dataset is reliable. Need to compare this with the other datasets collected at some point, but joining is a problem.
To do: - check against Wire Monkey’s joined dataset - figure out the station name inconsistencies?
For now, stick with the sub.location.raw.data for the rest of the analysis.
Map of all subway stations in this dataset:
nyc_map <- get_map(location = "New York City")
ggmap(nyc_map) +
geom_point(data = subloc_bor, aes(x = station_longitude, y = station_latitude, color = borough)) +
ylim(40.495992, 40.915568) + # NYC city limits latitude coordinates
xlim(-74.257159, -73.699215) + # NYC city limits longitude coordinates
xlab("longitude") +
ylab("latitude") +
ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data") +
facet_wrap(~borough)
Warning: Removed 4 rows containing missing values (geom_rect).
#subloc_bor %>%
# ggplot(aes(x = station_longitude, y = station_latitude, color = borough)) +
# geom_point() +
# ylim(40.495992, 40.915568) + # NYC city limits latitude coordinates
# xlim(-74.257159, -73.699215) + # NYC city limits longitude coordinates
# xlab("longitude") +
# ylab("latitude") +
# ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data") +
# facet_wrap(~borough)
# there's a couple of wrong assignments - try removing them:
subloc_bor <- subloc_bor %>%
filter(!(borough == "MANHATTAN" & station_latitude < 40.7)) %>%
filter(!(borough == "BRONX" & station_latitude < 40.8))
ggmap(nyc_map) +
geom_point(data = subloc_bor, aes(x = station_longitude, y = station_latitude, color = borough)) +
ylim(40.495992, 40.915568) + # NYC city limits latitude coordinates
xlim(-74.257159, -73.699215) + # NYC city limits longitude coordinates
xlab("longitude") +
ylab("latitude") +
ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data") +
facet_wrap(~borough)
Warning: Removed 4 rows containing missing values (geom_rect).
test <- subloc_bor %>%
filter(borough == "BRONX" & station_latitude < 40.8)
ggmap(nyc_map) +
geom_point(data = test, aes(x = station_longitude, y = station_latitude)) +
ylim(40.495992, 40.915568) +
xlim(-74.257159, -73.699215)
Warning: Removed 1 rows containing missing values (geom_rect).
odd <- subloc_bor %>%
select(borough:station_name) %>%
distinct() %>%
group_by(division, line, station_name) %>%
count() %>%
filter(n > 1) %>%
arrange(station_name)
test2 <- subloc_bor %>%
select(borough:station_longitude) %>%
distinct() %>%
filter(division %in% odd$division & station_name %in% odd$station_name & line %in% odd$line) %>%
arrange(station_latitude, station_longitude, borough)
# doesn't matter if the two annotations are QUEENS and BROOKLYN - will be treated together, but mixups that involve BRONX and MANHATTAN would be helpful to fix
test3 <- test2 %>%
filter(station_latitude > 40.70442) %>%
arrange(station_longitude, station_latitude, borough)
test3.brnx.man <- test3 %>%
filter(borough == "MANHATTAN" | borough == "BROND")
test4 <- test3 %>%
filter(station_name %in% test3.brnx.man$station_name)
test4 %>%
select(division, line, station_name) %>%
distinct()
# A tibble: 7 x 3
division line station_name
<chr> <chr> <chr>
1 IND 6 Avenue East Broadway
2 IND 63rd Street Roosevelt Island
3 IRT Jerome 138th St
4 IRT Jerome 149th St-Grand Concourse
5 IRT Pelham 138th St-3rd Ave
6 IRT Broadway-7th Ave 207th St
7 IRT Broadway-7th Ave Marble Hill-225th St
subloc_bor <- subloc_bor %>%
filter(!(division == "IND" & line == "6 Avenue" & station_name == "East Broadway" & borough == "BROOKLYN")) %>%
filter(!(division == "IRT" & line == "Jerome" & station_name == "149th St-Grand Concourse" & borough == "MANHATTAN")) %>%
filter(!(division == "IRT" & line == "Pelham" & station_name == "138th St-3rd Ave" & borough == "MANHATTAN")) %>%
filter(!(division == "IRT" & line == "Broadway-7th Ave" & station_name == "207th St" & borough == "BRONX"))
# The F train Roosevelt Island stop is both Queens and Manhattan - leave it as such
# The Marble Hill - 225th St stop is also an odd station - leave as both Manhattan and Bronx
ggmap(nyc_map) +
geom_point(data = subloc_bor, aes(x = station_longitude, y = station_latitude, color = borough)) +
ylim(40.495992, 40.915568) + # NYC city limits latitude coordinates
xlim(-74.257159, -73.699215) + # NYC city limits longitude coordinates
xlab("longitude") +
ylab("latitude") +
ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data") +
facet_wrap(~borough)
Warning: Removed 4 rows containing missing values (geom_rect).
Notes: - No stations on Staten Island - will exclude Saten Island from this point on - Most stations have more than one row (multiple entrances/exits) - need to compress info - Station names in the station column are NOT unique (multiple 103rd St, etc) in the entrance/exit data, but station location may be a unique and consistent handle to grab all the entrances/exits associated with a station - For now, will call any station that has at least one ADA-accessible entrance/exit “accessible”, but we know that some stations are only partially accessible. For some station, different subway lines are serviced by different platforms within the same station, and not all platforms are fully accessible. This is where the elevator/escalator data joining may be useful.
# some info about entrances
subloc_bor %>%
ggplot(aes(x = entrance_type, fill = ada)) +
geom_bar() +
xlab("Entrance Type")
# same scale
subloc_bor %>%
ggplot(aes(x = entrance_type, fill = ada)) +
geom_bar(position = "fill") +
xlab("Entrance Type") +
ylab("Proportion of Entrance that is ADA-Accessible")
According to the description of the variables in this dataset, ADA == TRUE indicates that the station is ADA-Accessible, but not necessarily if that entrance is
ggmap(nyc_map) +
geom_point(
data = subloc_bor,
aes(x = station_longitude, y = station_latitude, color = ada),
size = 1.5,
alpha = 0.5) +
ylim(40.55, 40.915568) + # NYC city limits latitude coordinates
xlim(-74.1, -73.699215) + # NYC city limits longitude coordinates
xlab("longitude") +
ylab("latitude") +
ggtitle("All NYC Subway Stations\nIn the Subway Entrance/Exit Locations Dataset\nColor by Accessibility")
Warning: Removed 1 rows containing missing values (geom_rect).
# this data is missing the new Q-line information
Now the next questions is how do I “sample” points from the area of NYC?
I’ve worked a little bit with the NYC 311 call dataset before. Source: https://nycopendata.socrata.com/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9
Remembered that it has latitude/longitude location info for the calls. Grabbed the locations from the 2015 NYC 311 calls data - use as random sample?
nyc311_loc %>%
ggplot(aes(x = longitude, y = latitude)) +
geom_bin2d(bins = 400) +
coord_quickmap() +
scale_fill_viridis() +
ggtitle("Locations in the 2015 NYC 311 Calls Dataset")
Not bad. Empty areas are parks probably, should be okay to not include those.
# create unique handle using station name, division, and line columns
subloc_bor <- subloc_bor %>%
mutate(station_id = paste(division, line, station_name, sep = " / ")) %>%
select(borough, station_id, everything())
subloc_sml <- subloc_bor %>%
select(station_id, borough:routes, ada) %>%
distinct()
glimpse(subloc_sml)
Observations: 495
Variables: 9
$ station_id <chr> "IND / 8 Avenue / 103rd St", "IRT / Broadway...
$ borough <chr> "MANHATTAN", "MANHATTAN", "QUEENS", "MANHATT...
$ division <chr> "IND", "IRT", "IRT", "IRT", "BMT", "IND", "I...
$ line <chr> "8 Avenue", "Broadway-7th Ave", "Flushing", ...
$ station_name <chr> "103rd St", "103rd St", "103rd St", "103rd S...
$ station_latitude <dbl> 40.79609, 40.79945, 40.74986, 40.79060, 40.6...
$ station_longitude <dbl> -73.96145, -73.96838, -73.86270, -73.94748, ...
$ routes <chr> "BC", "1", "7", "6", "JZ", "A", "6", "23", "...
$ ada <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA...
not.uni <- subloc_sml %>%
group_by(station_id) %>%
count() %>%
filter(n > 1)
subloc_sml_prob <- subloc_sml %>%
filter(station_id %in% not.uni$station_id) %>%
arrange(station_id)
# there's some dublicate stations still, must have had several station locations associated with those
subloc_sml <- subloc_sml %>%
group_by(station_id) %>%
mutate(avg_lat = mean(station_latitude)) %>%
mutate(avg_long = mean(station_longitude)) %>%
select(station_id:borough, avg_lat:avg_long, routes, ada) %>%
ungroup() %>%
distinct()
nrow(subloc_sml)
[1] 485
length(subloc_sml$station_id)
[1] 485
ggmap(nyc_map) +
geom_point(data = subloc_sml, aes(x = avg_long, y = avg_lat, color = borough)) +
ylim(40.495992, 40.915568) + # NYC city limits latitude coordinates
xlim(-74.257159, -73.699215) + # NYC city limits longitude coordinates
xlab("longitude") +
ylab("latitude") +
ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data") +
facet_wrap(~borough)
Warning: Removed 4 rows containing missing values (geom_rect).
There’s still some duplicates in the stations that have slightly different route assignments, but similar location. I’m going to leave them in for now, not sure what to do with them.
Small-scale test on a tiny portion of the subway stations and the 311 locations:
# small scale test
# pick 10 locations in NYC and find the distance from that point to 5 stations in the city
loc_test <- nyc311_loc %>%
sample_n(10, replace = FALSE)
t1 <- rbind(subloc_sml$station_id, subloc_sml$station_id)
t2 <- as.data.frame(rbind(t1, t1, t1, t1, t1), stringsAsFactors = FALSE)
colnames(t2) <- paste("stat", 1:ncol(t2), sep = "")
t3 <- t2 %>% select(1:5)
t3
stat1 stat2
1 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
2 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
3 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
4 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
5 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
6 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
7 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
8 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
9 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
10 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
stat3 stat4
1 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
2 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
3 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
4 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
5 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
6 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
7 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
8 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
9 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
10 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
stat5
1 BMT / Broadway Jamaica / 104th St-102nd St
2 BMT / Broadway Jamaica / 104th St-102nd St
3 BMT / Broadway Jamaica / 104th St-102nd St
4 BMT / Broadway Jamaica / 104th St-102nd St
5 BMT / Broadway Jamaica / 104th St-102nd St
6 BMT / Broadway Jamaica / 104th St-102nd St
7 BMT / Broadway Jamaica / 104th St-102nd St
8 BMT / Broadway Jamaica / 104th St-102nd St
9 BMT / Broadway Jamaica / 104th St-102nd St
10 BMT / Broadway Jamaica / 104th St-102nd St
dist_test <- bind_cols(loc_test, t3) %>%
gather(key = "station.num", value = "station_id", 4:8) %>%
arrange(station.num)
# each of the 10 (long,lat) points is now paired with each of the 5 stations selected
dist_test
# A tibble: 50 x 5
borough latitude longitude station.num station_id
<chr> <dbl> <dbl> <chr> <chr>
1 BRONX 40.9 -73.9 stat1 IND / 8 Avenue / 103rd St
2 BROOKLYN 40.6 -73.9 stat1 IND / 8 Avenue / 103rd St
3 MANHATTAN 40.9 -73.9 stat1 IND / 8 Avenue / 103rd St
4 BROOKLYN 40.6 -74.0 stat1 IND / 8 Avenue / 103rd St
5 QUEENS 40.8 -73.9 stat1 IND / 8 Avenue / 103rd St
6 QUEENS 40.6 -73.9 stat1 IND / 8 Avenue / 103rd St
7 QUEENS 40.8 -73.9 stat1 IND / 8 Avenue / 103rd St
8 BROOKLYN 40.6 -73.9 stat1 IND / 8 Avenue / 103rd St
9 QUEENS 40.7 -73.8 stat1 IND / 8 Avenue / 103rd St
10 QUEENS 40.7 -73.9 stat1 IND / 8 Avenue / 103rd St
# ... with 40 more rows
# link the station.id to a (long,lat) pair for that station:
dist_test_w_stat_loc <- dist_test %>%
inner_join(subloc_sml, by = "station_id")
# now calculate distance between point and the station using the geosphere package:
dist.test.res <- dist_test_w_stat_loc %>%
mutate(
dist.to.stat = distGeo(
# p1 = NYC 311 location
p1 = as.matrix(dist_test_w_stat_loc %>% select(longitude, latitude)),
# p2 = subway station location
p2 = as.matrix(dist_test_w_stat_loc %>% select(avg_long, avg_lat))
) * 0.000621371 # convert meters to miles
) %>%
mutate(loc.id = paste(borough.y, latitude, longitude, sep = " / "))
Note: the distances are based on direct distance from point to point
To do: - Is there some way to incorporate Manhattan Distance into the calculation to get a more accurate distance measurement?
What’s the distance from the NYC 311 point to the closest station (out of the 5):
dim(dist.test.res)
[1] 50 12
dist.test.nearest <- dist.test.res %>%
group_by(loc.id) %>%
summarize(nearest.stat = min(dist.to.stat)) %>%
ungroup() %>%
inner_join((dist.test.res %>% select(longitude, latitude, loc.id)), by = "loc.id") %>%
unique()
dim(dist.test.nearest)
[1] 20 4
ggmap(nyc_map) +
geom_point(data = dist.test.nearest, aes(x = longitude, y = latitude, color = nearest.stat), size = 3) +
scale_color_viridis(option = "B") +
ylim(40.55, 40.915568) + # NYC city limits latitude coordinates
xlim(-74.1, -73.699215) + # NYC city limits longitude coordinates
xlab("Longitude") +
ylab("Latitude") +
ggtitle("Direct distance to nearest subway station from point in NYC")
Warning: Removed 1 rows containing missing values (geom_rect).
Test run with a small fraction of the data seems to have worked, now to tackle the full data.
(It’s currently set to not evaluate)
I exported the distance to any nearest subway station to any point and the distance to the nearest ada subway station into csv files. They’re imported in the next chunk.
glimpse(nyc311_loc)
Observations: 103,510
Variables: 3
$ borough <chr> "BROOKLYN", "QUEENS", "MANHATTAN", "BROOKLYN", "QUEE...
$ latitude <dbl> 40.60375, 40.76693, 40.79832, 40.62297, 40.68408, 40...
$ longitude <dbl> -73.99558, -73.88903, -73.94185, -73.91094, -73.8448...
glimpse(subloc_sml)
Observations: 485
Variables: 6
$ station_id <chr> "IND / 8 Avenue / 103rd St", "IRT / Broadway-7th Av...
$ borough <chr> "MANHATTAN", "MANHATTAN", "QUEENS", "MANHATTAN", "Q...
$ avg_lat <dbl> 40.79609, 40.79945, 40.74986, 40.79060, 40.69518, 4...
$ avg_long <dbl> -73.96145, -73.96838, -73.86270, -73.94748, -73.844...
$ routes <chr> "BC", "1", "7", "6", "JZ", "A", "6", "23", "J", "7"...
$ ada <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA...
nyc311_loc <- nyc311_loc %>%
mutate(
borough = ifelse(
borough == "MANHATTAN", "MANHATTAN",
ifelse(
borough == "BRONX", "BRONX",
ifelse(
borough == "BROOKLYN", "BROOKLYN_QUEENS", "BROOKLYN_QUEENS"
)
)
)
)
subloc_sml <- subloc_sml %>%
mutate(
borough = ifelse(
borough == "MANHATTAN", "MANHATTAN",
ifelse(
borough == "BRONX", "BRONX",
ifelse(
borough == "BROOKLYN", "BROOKLYN_QUEENS", "BROOKLYN_QUEENS"
)
)
)
)
set.seed(2001)
test <- nyc311_loc %>%
sample_frac(0.001, replace = FALSE) %>%
full_join(subloc_sml, by = "borough")
distances <- test %>%
mutate(
dist.to.stat = distGeo(
# p1 = NYC 311 location
p1 = as.matrix(test %>% select(longitude, latitude)),
# p2 = subway station location
p2 = as.matrix(test %>% select(avg_long, avg_lat))
) * 0.000621371 # convert meters to miles
) %>%
mutate(loc_id = paste(borough, latitude, longitude, sep = " / "))
min.distance <- distances %>%
group_by(loc_id) %>%
summarize(nearest.stat = min(dist.to.stat)) %>%
ungroup() %>%
inner_join((distances %>% select(longitude, latitude, loc_id)), by = "loc_id") %>%
unique()
nyc_loc_full_join <- nyc311_loc %>%
full_join(subloc_sml, by = "borough")
p1 <- as.matrix(nyc_loc_full_join %>% select(longitude, latitude))
p2 <- as.matrix(nyc_loc_full_join %>% select(avg_long, avg_lat))
dist.to.stat <- distGeo(
# p1 = NYC 311 location
p1 = p1,
# p2 = subway station location
p2 = p2
) * 0.000621371
nyc_loc_full_join$dist.to.stat <- dist.to.stat
nyc_loc_full_join$loc_id <- paste(nyc_loc_full_join$borough, nyc_loc_full_join$latitude, nyc_loc_full_join$longitude, sep = " / ")
nyc_loc_full_join %>%
summarize(
mean.dist = mean(dist.to.stat),
med.dist = median(dist.to.stat)
)
# A tibble: 1 x 2
mean.dist med.dist
<dbl> <dbl>
1 6.09 5.61
nyc_loc_full_join %>%
ggplot(aes(dist.to.stat)) +
geom_histogram(bins = 100) +
geom_vline(xintercept = mean(dist.to.stat), color = "#E69F00", size = 2, alpha = 0.8) +
geom_vline(xintercept = median(dist.to.stat), color = "#56B4E9", size = 2, alpha = 0.8) +
ggtitle("Histogram of distances to all stations") +
xlab("Distance from point in NYC to subway station\nWithin burrough")
nyc_loc_full_join %>%
ggplot(aes(sqrt(dist.to.stat))) +
geom_histogram(bins = 100) +
geom_vline(xintercept = mean(sqrt(dist.to.stat)), color = "#E69F00", size = 2, alpha = 0.8) +
geom_vline(xintercept = median(sqrt(dist.to.stat)), color = "#56B4E9", size = 2, alpha = 0.8) +
ggtitle("Histogram of square root of distances to all stations") +
xlab("Square Root of distance from point in NYC to subway station\nWithin burrough")
nyc_loc_full_join %>%
ggplot(aes(dist.to.stat)) +
geom_histogram(bins = 100) +
geom_vline(xintercept = mean(dist.to.stat), color = "#E69F00", size = 2, alpha = 0.8) +
geom_vline(xintercept = median(dist.to.stat), color = "#56B4E9", size = 2, alpha = 0.8) +
ggtitle("Histogram of distances to ADA-Accessible stations only") +
xlab("Distance from point in NYC to ADA-accessible subway station\nWithin burrough") +
facet_wrap(~ada, nrow = 1)
all.distance <- nyc_loc_full_join %>%
select(-c(station_id, routes)) %>%
distinct() %>%
group_by(loc_id, borough, latitude, longitude) %>%
summarize(
nearest.stat = min(dist.to.stat),
mean.dist = mean(dist.to.stat),
med.dist = median(dist.to.stat)
) %>%
ungroup()
all.distance %>%
select(loc_id, nearest.stat:med.dist) %>%
gather(key = "Dist.type", value = "Distance", -loc_id) %>%
ggplot(aes(Distance)) +
geom_histogram(bins = 50) +
facet_wrap(~ Dist.type, nrow = 2)
all.distance %>%
select(loc_id, nearest.stat:med.dist) %>%
gather(key = "Dist.type", value = "Distance", -loc_id) %>%
ggplot(aes(Distance)) +
geom_histogram(bins = 50) +
facet_wrap(~ Dist.type, nrow = 1)
all.distance %>%
select(loc_id, borough, mean.dist:med.dist) %>%
gather(key = "Dist.type", value = "Distance", mean.dist:med.dist) %>%
ggplot(aes(Distance)) +
geom_histogram(bins = 50) +
facet_wrap(Dist.type ~ borough)
ada.distance <- nyc_loc_full_join %>%
filter(ada) %>%
select(-c(station_id, routes)) %>%
distinct() %>%
group_by(loc_id, borough, latitude, longitude) %>%
summarize(
nearest.ada.stat = min(dist.to.stat),
mean.ada.stat = mean(dist.to.stat),
med.ada.stat = median(dist.to.stat)
) %>%
ungroup()
ada.distance %>%
select(loc_id, nearest.ada.stat:med.ada.stat) %>%
gather(key = "Dist.type", value = "Distance", -loc_id) %>%
ggplot(aes(Distance)) +
geom_histogram(bins = 50) +
facet_wrap(~ Dist.type, nrow = 2)
ada.distance %>%
select(loc_id, mean.ada.stat:med.ada.stat) %>%
gather(key = "Dist.type", value = "Distance", -loc_id) %>%
ggplot(aes(Distance)) +
geom_histogram(bins = 50) +
facet_wrap(~ Dist.type, nrow = 1)
ada.distance %>%
select(loc_id, borough, mean.ada.stat:med.ada.stat) %>%
gather(key = "Dist.type", value = "Distance", mean.ada.stat:med.ada.stat) %>%
ggplot(aes(Distance)) +
geom_histogram(bins = 50) +
facet_wrap(Dist.type ~ borough)
### top 5
all.top5.distance <- nyc_loc_full_join %>%
select(-c(station_id, routes)) %>%
distinct() %>%
group_by(loc_id, borough, latitude, longitude) %>%
top_n(-5, dist.to.stat) %>%
summarize(
mean.top5.all = mean(dist.to.stat),
med.top5.all = median(dist.to.stat)
) %>%
ungroup()
ada.top5.distance <- nyc_loc_full_join %>%
filter(ada) %>%
select(-c(station_id, routes)) %>%
distinct() %>%
group_by(loc_id, borough, latitude, longitude) %>%
top_n(-5, dist.to.stat) %>%
summarize(
mean.top5.ada = mean(dist.to.stat),
med.top5.ada = median(dist.to.stat)
) %>%
ungroup()
all.top5.distance %>%
select(loc_id, mean.top5.all:med.top5.all) %>%
gather(key = "Dist.type", value = "Distance", -loc_id) %>%
ggplot(aes(Distance)) +
geom_histogram(bins = 100) +
facet_wrap(~Dist.type)
ada.top5.distance %>%
select(loc_id, mean.top5.ada:med.top5.ada) %>%
gather(key = "Dist.type", value = "Distance", -loc_id) %>%
ggplot(aes(Distance)) +
geom_histogram(bins = 100) +
facet_wrap(~Dist.type)
Distance to any type of station from a particular point in NYC:
all.distance %>%
ggplot(aes(x = longitude, y = latitude, color = nearest.stat)) +
geom_point(size = 0.2, alpha = 0.2) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap() +
ggtitle("Direct distance to any nearest subway station from point in NYC")
all.distance %>%
ggplot(aes(x = longitude, y = latitude, color = mean.dist)) +
geom_point(size = 0.2, alpha = 0.2) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap() +
ggtitle("Direct distance to any nearest subway station from point in NYC")
ada.distance %>%
ggplot(aes(x = longitude, y = latitude, color = nearest.ada.stat)) +
geom_point(size = 0.25, alpha = 0.2) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap() +
ggtitle("Direct distance to nearest ADA-Accessible subway station from point in NYC")
ada.distance %>%
ggplot(aes(x = longitude, y = latitude, color = mean.ada.stat)) +
geom_point(size = 0.25, alpha = 0.2) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap() +
ggtitle("Direct distance to nearest ADA-Accessible subway station from point in NYC")
all.top5.distance %>%
ggplot(aes(x = longitude, y = latitude, color = mean.top5.all)) +
geom_point(size = 0.2, alpha = 0.2) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap()
ada.top5.distance %>%
ggplot(aes(x = longitude, y = latitude, color = mean.top5.ada)) +
geom_point(size = 0.2, alpha = 0.2) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap()
ggmap(nyc_map) +
geom_point(data = all.distance, aes(x = longitude + 0.002, y = latitude - 0.002, color = mean.dist), size = 0.4, alpha = 0.5) +
scale_color_viridis(option = "B") +
ylim(40.55, 40.915568) + # NYC city limits latitude coordinates
xlim(-74.1, -73.699215) + # NYC city limits longitude coordinates
xlab("longitude") +
ylab("latitude") +
ggtitle("Direct distance to any nearest subway station from point in NYC")
Warning: Removed 1 rows containing missing values (geom_rect).
Warning: Removed 8 rows containing missing values (geom_point).
Unsurprisingly, some areas are just far from any subway stations and that carries over into what areas are far from ADA stations.
dist.merge <- all.distance %>%
inner_join(
ada.distance %>% select(-(borough:longitude)),
by = "loc_id"
) %>%
inner_join(
all.top5.distance %>% select(-(borough:longitude)),
by = "loc_id"
) %>%
inner_join(
ada.top5.distance %>% select(-(borough:longitude)),
by = "loc_id"
)
glimpse(dist.merge)
Observations: 103,506
Variables: 14
$ loc_id <chr> "BRONX / 40.7560423362848 / -73.9257889541193...
$ borough <chr> "BRONX", "BRONX", "BRONX", "BRONX", "BRONX", ...
$ latitude <dbl> 40.75604, 40.79847, 40.79871, 40.79892, 40.79...
$ longitude <dbl> -73.92579, -73.91072, -73.91114, -73.91147, -...
$ nearest.stat <dbl> 3.4589419, 0.5070450, 0.4838138, 0.4651185, 0...
$ mean.dist <dbl> 6.857871, 3.931517, 3.922247, 3.914472, 3.883...
$ med.dist <dbl> 6.853549, 3.994368, 3.990962, 3.987878, 3.945...
$ nearest.ada.stat <dbl> 4.1661391, 1.2720243, 1.2495533, 1.2312540, 1...
$ mean.ada.stat <dbl> 6.805051, 3.901289, 3.889184, 3.879117, 3.854...
$ med.ada.stat <dbl> 7.480305, 4.468260, 4.454069, 4.442268, 4.417...
$ mean.top5.all <dbl> 3.6991168, 0.8329043, 0.8136531, 0.7981331, 0...
$ med.top5.all <dbl> 3.7561349, 0.7706210, 0.7440756, 0.7225358, 0...
$ mean.top5.ada <dbl> 5.313664, 2.419564, 2.401859, 2.387339, 2.376...
$ med.top5.ada <dbl> 4.964894, 2.176926, 2.153263, 2.133949, 2.140...
# sanity checks for joining:
sapply(all.distance, anyNA)
loc_id borough latitude longitude nearest.stat
FALSE FALSE FALSE FALSE FALSE
mean.dist med.dist
FALSE FALSE
sapply(ada.distance, anyNA)
loc_id borough latitude longitude
FALSE FALSE FALSE FALSE
nearest.ada.stat mean.ada.stat med.ada.stat
FALSE FALSE FALSE
sapply(dist.merge, anyNA)
loc_id borough latitude longitude
FALSE FALSE FALSE FALSE
nearest.stat mean.dist med.dist nearest.ada.stat
FALSE FALSE FALSE FALSE
mean.ada.stat med.ada.stat mean.top5.all med.top5.all
FALSE FALSE FALSE FALSE
mean.top5.ada med.top5.ada
FALSE FALSE
# joining seems ok
dist.merge <- dist.merge %>%
mutate(
# normalize dist to nearest station by subtraction
any.vs.ada.min.dist = nearest.ada.stat - nearest.stat,
# normalize dist to nearest station by division
any.vs.ada.min.ratio = nearest.ada.stat / nearest.stat,
# repeat for means
any.vs.ada.mean.dist = mean.ada.stat - mean.dist,
any.vs.ada.mean.ratio = mean.ada.stat / mean.dist,
# repeat for med
any.vs.ada.med.dist = med.ada.stat - med.dist,
any.vs.ada.med.ratio = med.ada.stat / med.dist,
# repeat for closest 5 stations means
top5.any.vs.ada.mean.dist = mean.top5.ada - mean.top5.all,
top5.any.vs.ada.mean.ratio = mean.top5.ada / mean.top5.all,
# repeat for closes 5 station medians
top5.any.vs.ada.med.dist = med.top5.ada - med.top5.all,
top5.any.vs.ada.med.ratio = med.top5.ada / med.top5.all
) %>%
select(loc_id:longitude, starts_with("any.vs.ada"), starts_with("top5.any.vs.ada"))
glimpse(dist.merge)
Observations: 103,506
Variables: 14
$ loc_id <chr> "BRONX / 40.7560423362848 / -73.925...
$ borough <chr> "BRONX", "BRONX", "BRONX", "BRONX",...
$ latitude <dbl> 40.75604, 40.79847, 40.79871, 40.79...
$ longitude <dbl> -73.92579, -73.91072, -73.91114, -7...
$ any.vs.ada.min.dist <dbl> 0.7071972, 0.7649793, 0.7657395, 0....
$ any.vs.ada.min.ratio <dbl> 1.204455, 2.508701, 2.582715, 2.647...
$ any.vs.ada.mean.dist <dbl> -0.05282031, -0.03022794, -0.033062...
$ any.vs.ada.mean.ratio <dbl> 0.9922979, 0.9923114, 0.9915705, 0....
$ any.vs.ada.med.dist <dbl> 0.6267557, 0.4738921, 0.4631062, 0....
$ any.vs.ada.med.ratio <dbl> 1.091450, 1.118640, 1.116039, 1.113...
$ top5.any.vs.ada.mean.dist <dbl> 1.614547, 1.586659, 1.588206, 1.589...
$ top5.any.vs.ada.mean.ratio <dbl> 1.436468, 2.904972, 2.951945, 2.991...
$ top5.any.vs.ada.med.dist <dbl> 1.208759, 1.406305, 1.409188, 1.411...
$ top5.any.vs.ada.med.ratio <dbl> 1.321809, 2.824898, 2.893877, 2.953...
Histogram of the distances:
dist.merge %>%
select(loc_id:borough, ends_with(".dist")) %>%
rename(
"Minimum_Diff" = any.vs.ada.min.dist,
"Mean_Diff" = any.vs.ada.mean.dist,
"Med_Diff" = any.vs.ada.med.dist,
"Closest_5Stations_Mean_Diff" = top5.any.vs.ada.mean.dist,
"Closest_5Stations_Med_Diff" = top5.any.vs.ada.med.dist) %>%
gather(key = "Dist.type", value = "Norm_Distance", -(loc_id:borough)) %>%
ggplot(aes(x = Norm_Distance)) +
geom_histogram(bins = 100) +
facet_wrap(~Dist.type) +
xlab("Normalized Distance (miles)") +
ggtitle("Difference in distance\nBetween nearest ADA-Accessible subway station and any subway station")
dist.merge %>%
ggplot(aes(any.vs.ada.min.dist)) +
geom_histogram(bins = 100)
dist.merge %>%
ggplot(aes(any.vs.ada.med.dist)) +
geom_histogram(bins = 100)
dist.merge %>%
ggplot(aes(top5.any.vs.ada.mean.dist)) +
geom_histogram(bins = 100)
dist.merge %>%
ggplot(aes(top5.any.vs.ada.med.dist)) +
geom_histogram(bins = 100)
dist.merge %>%
select(loc_id:borough, ends_with(".ratio")) %>%
select(-any.vs.ada.min.ratio) %>%
rename(
"Mean_Ratio" = any.vs.ada.mean.ratio,
"Med_Ratio" = any.vs.ada.med.ratio,
"Closest_5Stations_Mean_Ratio" = top5.any.vs.ada.mean.ratio,
"Closest_5Stations_Med_Ratio" = top5.any.vs.ada.med.ratio
) %>%
gather(key = "Dist.type", value = "Norm_Distance", -(loc_id:borough)) %>%
ggplot(aes(x = Norm_Distance)) +
geom_histogram(bins = 100) +
facet_wrap(~Dist.type) +
xlab("Ratio") +
ggtitle("Ratio in distance\nBetween nearest ADA-Accessible subway station and any subway station")
dist.merge %>%
ggplot(aes(any.vs.ada.mean.ratio)) +
geom_histogram(bins = 100)
dist.merge %>%
ggplot(aes(any.vs.ada.med.ratio)) +
geom_histogram(bins = 100)
dist.merge %>%
ggplot(aes(top5.any.vs.ada.mean.ratio)) +
geom_histogram(bins = 100)
dist.merge %>%
ggplot(aes(top5.any.vs.ada.med.ratio)) +
geom_histogram(bins = 100)
Maps of the normalized distances to ADA-Accessible stations:
### subtraction normalization
dist.merge %>%
ggplot(aes(x = longitude, y = latitude, color = any.vs.ada.min.dist)) +
geom_point(size = 0.1) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap() +
ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC\nNormalized to distance to nearest any kind of subway station (subtraction)") +
xlab("Longitude") +
ylab("Latitude")
dist.merge %>%
ggplot(aes(x = longitude, y = latitude, color = any.vs.ada.min.dist)) +
geom_point(size = 0.2, alpha = 0.3) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap() +
ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC\nNormalized to distance to nearest any kind of subway station (subtraction)") +
xlab("Longitude") +
ylab("Latitude")
dist.merge %>%
ggplot(aes(x = longitude, y = latitude, color = any.vs.ada.med.ratio)) +
geom_point(size = 0.2, alpha = 0.3) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap()
dist.merge %>%
ggplot(aes(x = longitude, y = latitude, color = top5.any.vs.ada.mean.dist)) +
geom_point(size = 0.2, alpha = 0.3) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap()
dist.merge %>%
ggplot(aes(x = longitude, y = latitude, color = top5.any.vs.ada.mean.ratio)) +
geom_point(size = 0.2, alpha = 0.3) +
scale_color_viridis(option = "B") +
theme(panel.background = element_rect(fill = 'grey50')) +
coord_quickmap()
What are the rough locations under-serviced? Google map can help here, even though it’s not perfect.
ggmap(nyc_map) +
geom_point(data = dist.merge, aes(x = longitude + 0.002, y = latitude - 0.002, color = any.vs.ada.min.dist), size = 0.2) +
scale_color_viridis(option = "B") +
ylim(40.55, 40.915568) + # NYC city limits latitude coordinates
xlim(-74.1, -73.699215) + # NYC city limits longitude coordinates
xlab("Longitude") +
ylab("Latitude") +
ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC\nNormalized to distance to nearest any kind of subway station (subtraction)")
Warning: Removed 1 rows containing missing values (geom_rect).
Warning: Removed 8 rows containing missing values (geom_point).
The ratio normalization doesn’t really work so well, the subtraction is probably the better choice in this case.
Next steps: - Separate distance calculations by borough for both the random point and the station, in case “closest” stations are across a river. Probably can keep Queens and Brooklyn together? - Incorporate outage data to find the “true” dead zones? Or areas that are usually suffering from long-term outages of accessibility equipment? - Find a better NYC map overlay - is it the 311 long,lat data that’s the problem, or the map I got from ggmap?
test.map <- readOGR("./data/nyct2010_18a", "nyct2010")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\Dasha\Documents\GitHub\subway_accessibility_map\data\nyct2010_18a", layer: "nyct2010"
## with 2166 features
## It has 11 fields
tm_shape(test.map) +
tm_borders()
test.map2 <- fortify(test.map)
ggplot(test.map2, aes(x = long, y = lat, group = group)) +
geom_polygon(fill = NA, color = "black")
test.map3 <- test.map[test.map$BoroName != "Staten Island", ]
tm_shape(test.map3) +
tm_borders()
dist.points <- SpatialPoints(dist.merge[, c(4, 3)])
proj4string(dist.points) <- CRS("+init=epsg:4326")
#proj4string(dist.points) <- CRS("+proj=lcc +lat_1=41.03333333333333 +lat_2=40.66666666666666 +lat_0=40.16666666666666 +lon_0=-74 +x_0=300000.0000000001 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs")
plot(dist.points)
dist.points <- spTransform(dist.points, proj4string(test.map3))
dist.by.tract <- over(dist.points, test.map3)
dist.by.tract.frt <- fortify(dist.by.tract)
test4 <- dist.merge %>%
bind_cols(dist.by.tract.frt)
test5 <- test4 %>%
group_by(CT2010) %>%
summarize(avg_to_ada_stat = mean(top5.any.vs.ada.mean.dist))
test6 <- test4 %>%
group_by(NTAName) %>%
summarize(avg_to_ada_stat = mean(top5.any.vs.ada.mean.dist)) %>%
arrange(desc(avg_to_ada_stat))
merge.to.map <- merge(test.map3, test5, by.x = "CT2010", by.y = "CT2010")
merge.to.map2 <- merge.to.map
merge.to.map2[str_detect(merge.to.map2$NTAName, "park-cemetery-etc") | str_detect(merge.to.map2$NTAName, "Airport"), ] <- NA
tm_shape(merge.to.map2) +
tm_fill("avg_to_ada_stat", style = "fixed", breaks = c(0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 3, 5.9643), palette = viridis(n = 8, option = "A"))
tm_shape(merge.to.map2) +
tm_fill("avg_to_ada_stat", style = "fixed", breaks = c(0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 3, 5.9643), palette = viridis(n = 8, option = "B"))
tm_shape(merge.to.map2) +
tm_fill("avg_to_ada_stat", style = "fixed", breaks = c(0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 3, 5.9643), palette = viridis(n = 8, option = "C"))
tm_shape(merge.to.map2) +
tm_fill("avg_to_ada_stat", style = "fixed", breaks = c(0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 3, 5.9643), palette = viridis(n = 8, option = "D"))
tm_shape(merge.to.map2) +
tm_fill("avg_to_ada_stat", style = "fixed", breaks = c(0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 3, 5.9643), palette = viridis(n = 8, option = "E"))
tm_shape(merge.to.map2) +
tm_borders() +
tm_fill("avg_to_ada_stat", style = "fixed", breaks = c(0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 3, 5.9643), palette = viridis(n = 8, option = "A"))
merge.to.map.neigh <- merge(test.map3, test6, by.x = "NTAName", by.y = "NTAName")
merge.to.map.neigh[str_detect(merge.to.map.neigh$NTAName, "park-cemetery-etc") | str_detect(merge.to.map.neigh$NTAName, "Airport"), ] <- NA
tm_shape(merge.to.map.neigh) +
tm_borders()+
tm_fill("avg_to_ada_stat", midpoint = NA, palette = viridis(n = 6, option = "A"))
sum(str_detect(merge.to.map$NTAName, "park-cemetery-etc"))
## [1] 39
merge.parks <- merge.to.map[str_detect(merge.to.map$NTAName, "park-cemetery-etc"),]
tm_shape(merge.parks) +
tm_fill()